# coding: utf8
import re
import sys
import requests


class Spider:
    """
    ChinaUnix BBS spider.

    Downloads one board listing page of http://bbs.chinaunix.net/
    (``forum-24-1.html``), follows every thread link found on it and writes
    the title, type, URL and cleaned-up text of the first matching post body
    of each thread to a file or to standard output.

    The class is written so that it runs unchanged on Python 2 and Python 3.
    """

    # Cells of the listing table; the thread links live inside them.
    _TH_PATTERN = re.compile(r'<th(.*?)</th', re.S)

    # Capturing groups: (1) question type, (2) question URL, (3) question
    # title.  ``(?#...)`` is an inline regex comment labelling the group that
    # follows it.
    _THREAD_PATTERN = re.compile(
        r'\[<a[^>]+>(?#quesiton_type)([^<]+)[^"]+"(?#quesiton_url)([^"]+).*?>(?#quesiton_title)([^<]+)<'
    )

    # Body of a post inside a thread page.
    _CONTENT_PATTERN = re.compile(r'JIATHIS_CODE_HTML4">(.*?)</td', re.S)

    # Any remaining tag, plus carriage returns coming from the page source.
    _TAG_PATTERN = re.compile(r'<.*?>|\r\n|\r')

    _NEWLINES_PATTERN = re.compile(r'\n+')

    def __init__(self, output=None, timeout=10):
        """
        Set up the spider.

        :param output: path of the file the results are written to.  When it
            is omitted (or empty) the results go to ``sys.stdout``.
        :param timeout: seconds to wait for the server on each HTTP request
            before ``requests`` gives up, so a stalled connection cannot
            hang the crawl forever.
        """
        self.url = 'http://bbs.chinaunix.net/'
        self.shell_url = self.url + 'forum-24-1.html'
        self.timeout = timeout
        self.contents_list = []
        # Progress messages are printed only when the results go to a file,
        # so they never interleave with results written to stdout.
        self.print_flag = 1 if output else 0
        self.html_flag_converation = {
            '&nbsp;': ' ',
            '<br />': '\n',
            '&quot;': '"',
            '&lt;': '<',
            '&gt;': '>',
        }
        # Only a file opened here may be closed by close(); stdout is shared.
        self._owns_output = bool(output)
        if output:
            if sys.version_info[0] >= 3:
                # Python 3 text files need an explicit encoding to store the
                # forum text regardless of the platform default.
                self.result_output = open(output, 'w', encoding='utf8')
            else:
                self.result_output = open(output, 'w')
        else:
            self.result_output = sys.stdout

    @staticmethod
    def _native(text):
        """Return ``text`` as a native ``str`` (UTF-8 encoded on Python 2)."""
        if isinstance(text, str):
            return text
        return text.encode('utf8')

    def _write(self, text):
        """Write ``text`` to the configured output."""
        self.result_output.write(self._native(text))

    def get_contents_list(self):
        """
        Download the board listing and collect its thread links.

        Every ``(type, url, title)`` tuple found is appended to
        ``self.contents_list``; table cells without a thread link are
        ignored.
        """
        text = requests.get(self.shell_url, timeout=self.timeout).text
        for th in self._TH_PATTERN.findall(text):
            match = self._THREAD_PATTERN.search(th)
            if match:
                # content_type, content_url, content_title
                self.contents_list.append(match.groups())

    def clean_content(self, raw_html):
        """
        Convert the HTML of a post body into plain text.

        :param raw_html: HTML fragment of the post body.
        :returns: the text with ``<br />`` turned into newlines, all other
            tags and carriage returns removed, the known HTML entities
            decoded, leading whitespace stripped and runs of newlines
            collapsed into one.
        """
        conversions = list(self.html_flag_converation.items())
        text = raw_html

        # Tag conversions (``<br />``) have to run before tags are stripped,
        # otherwise the line breaks would be lost.
        for html_flag, replace_str in conversions:
            if html_flag.startswith('<'):
                text = text.replace(html_flag, replace_str)

        text = self._TAG_PATTERN.sub('', text)

        # Entities are decoded only after the tags are gone: decoding
        # ``&lt;``/``&gt;`` first would make ordinary text such as
        # ``a < b > c`` look like a tag and get deleted.
        for html_flag, replace_str in conversions:
            if not html_flag.startswith('<'):
                text = text.replace(html_flag, replace_str)

        # ``&amp;`` goes last so that ``&amp;lt;`` is decoded exactly once.
        text = text.replace('&amp;', '&')

        return self._NEWLINES_PATTERN.sub('\n', text.lstrip())

    def get_real_content(self):
        """
        Fetch every collected thread and write its cleaned content.

        Threads whose page does not contain a post body are reported on
        stderr and skipped instead of aborting the whole run.
        """
        num = 1
        for content_type, content_url, content_title in self.contents_list:
            content_url = self.url + content_url
            content_text = requests.get(content_url, timeout=self.timeout).text
            match = self._CONTENT_PATTERN.search(content_text)
            if match is None:
                sys.stderr.write(self._native(
                    'Skipped %s: post content not found\n' % content_url
                ))
                continue

            content_result = self.clean_content(match.group(1))

            # According to the output, print/write the result.
            self._write('-' * 30 + '\n')
            self._write('Question_title: %s\nQuestion_type: %s\nQuestion_url: %s\n' % (
                content_title,
                content_type,
                content_url
            ))
            self._write('Question_content:\n%s' % content_result)
            if self.print_flag:
                # A single parenthesised argument prints the same on
                # Python 2 and Python 3.
                print('Question: %02d: %s had written' % (num, content_url))
            num += 1

    def close(self):
        """Flush the output and close it if this spider opened it."""
        if self._owns_output:
            self.result_output.close()
        else:
            self.result_output.flush()

    def start(self):
        """
        Run the whole crawl: collect the thread list, then dump each thread.

        The output file (if any) is closed when the crawl ends, even on
        error, so ``start`` is meant to be called once per instance.
        """
        try:
            self.get_contents_list()
            self.get_real_content()
        finally:
            self.close()

if __name__ == '__main__':
    # Usage: <script> [output_file]
    # Exactly one command-line argument selects the output file; with any
    # other argument count the results are written to standard output.
    cli_args = sys.argv[1:]
    file_name = cli_args[0] if len(cli_args) == 1 else None

    Spider(output=file_name).start()

